##
## DescriptiveStatsAndGraphs.R
##   descriptive statistics and plots, and creation of probabilities and their sample variances
##   for Chiappori-Salanie-Weiss (AER), "Partner choice..."
##


rm(list = ls())

require(lattice)
require(dplyr)
require(tidyr)

##########  auxiliary functions   ####################################################

## print interesting quantiles
pQuantiles <- function(x) {
  dx <- deparse(substitute(x))
  cat("Interesting quantiles of", dx, ":\n")
  print(quantile(x, c(1, 5, 10, 25, 50, 75, 90, 95, 99) / 100))
}


## print line(s) with stars

pstarline <- function(nl = 1, str = '') {
  cat("\n")
  for (i in 1:nl) {
    cat("     ********************************************* \n")
  }
  if (str != '') {
    cat(str)
    pstarline()
  }
  cat("\n")
}


######################################################################################

## select 1 observation in
nselect <- 1

## we will drop everyone who is observed older than oldlim
oldlim <- 100

## we will drop couples with ages at marriage  smaller than husbminmarrage or wifeminmarrage
husbminmarrage <- 16
wifeminmarrage <- 16

## for singles, only keep the never married
onlyNeverMarried <- T

datadir <- "CSW-17-aer/Dataset/"

datacouples <- "CSWcouples0814.txt"
datasingles <- "CSWsingles0814.txt"

## I= education of man, J= education of woman
## year= of survey ; hhwt=household sampling weight

## for couples: year, hhwt, husbrace, wiferace, I, J, age-man, age-woman, year-marriage
bigdcouples <-
  read.table(file = paste(datadir, datacouples, sep = ''),
             header = F)
names(bigdcouples) <-
  c(
    "YearSurvey",
    "HHweight",
    "RaceHusband",
    "RaceWife",
    "EducHusband",
    "EducWife",
    "AgeHusband",
    "AgeWife",
    "YearMarried"
  )
##    and we add marst=1
bigdcouples$MarriageStatus <- 1
varNames <- names(bigdcouples)

## for singles: year, hhwt, race, educ, age, sex (male 1, female 2), marst
bigdsingles <-
  read.table(file = paste(datadir, datasingles, sep = ''),
             header = F)
names(bigdsingles) <- c("YearSurvey",
                        "HHweight",
                        "Race",
                        "Education",
                        "Age",
                        "Gender",
                        "MarriageStatus")
if (onlyNeverMarried) {
  cat("\nDropping singles who ever married:\n")
  with(subset(bigdsingles, Gender == 2), sum(MarriageStatus == 6))
  with(subset(bigdsingles, Gender == 1), sum(MarriageStatus == 6))
  bigdsingles <- bigdsingles[bigdsingles$MarriageStatus == 6, ]
}




if (nselect > 1) {
  ncouples <- NROW(bigdcouples)
  bigdcouples <-
    bigdcouples[sample(ncouples, floor(ncouples / nselect)), ]
  nsingles <- NROW(bigdsingles)
  bigdsingles <-
    bigdsingles[sample(nsingles, floor(nsingles / nselect)), ]
  pstarline()
  cat("Selected one obs in", nselect, "\n")
  pstarline()
}

## groups
ng <- 5
namesgroups <-  c(
  "High School Dropout",
  "High School Graduate",
  "Some College",
  "College Graduate",
  "Postgraduate"
)
smallnamesgroups <-  c("HSD", "HSG", "SC", "CG", "CG+")

## keep only whites (1), or blacks (3)
codeRace <- c(1,3)
ageLim <- c(40,45)
racenamev <- c("Whites",  "Blacks")
raceadjv <- c("white", "black")
raceadjvs <- c("whites", "blacks")


mendata <- list()
womendata <- list()

for (irace in 1:2) {
#irace <- 1
selectRace <- codeRace[irace]
younglim <- ageLim[irace]
## we will drop everyone who is observed younger than younglim

reclassOld <- 0

##  if reClassOld=1, we will reclassify couples as never married couples
##     with ages at marriage larger than husbmaxmarrage or wifemaxmarrage
##
husbmaxmarrage <-  younglim
wifemaxmarrage <- younglim


racename <- racenamev[irace]
raceadj <- raceadjv[irace]
raceadjs <- raceadjvs[irace]

# pstarline()
# cat("Working on", racename, "; reclass=",  reclassOld,
#     "; younglim=", younglim, "\n")
# pstarline()

#resdir <- paste0(datadir, racename, "/")

# sinkfile <-
#   paste0(datadir, "descstatsgraphs", racename, younglim, ".txt")
# if (sinkit) {
#   sink(file = sinkfile)
# }


dcouples <- bigdcouples

dsingles <- bigdsingles



## select races
if (selectRace > 0) {
  racehusb <- dcouples$RaceHusband
  racewife <- dcouples$RaceWife
  racesingle <- dsingles$Race
  
  pstarline()
  cat("Keeping ", raceadjs, "only\n")
  pstarline()
  dsingles <- dsingles[racesingle == selectRace, ]
  dcouples <-
    dcouples[(racewife == selectRace & racehusb == selectRace), ]
}


## we drop marriages that occur within the year of the survey
yearsurvey  <- dcouples$YearSurvey
yearmarr <- dcouples$YearMarried
cat("\nDropping",  sum(yearsurvey <= yearmarr),  "marriages  (",
    100 * mean(yearsurvey <= yearmarr),
    "%) that occurred in the year of the survey.\n")
dcouples <- dcouples[(yearsurvey > yearmarr), ]

## for couples: ages at marriage
yearsurvey  <- dcouples$YearSurvey
yearmarr <- dcouples$YearMarried
agehusb <- dcouples$AgeHusband
agewife <- dcouples$AgeWife
agemarrhusb <- agehusb - (yearsurvey - yearmarr)
agemarrwife <- agewife - (yearsurvey - yearmarr)

## construct data

## select on current ages

selagesinterval <-
  ((agehusb >= younglim) & (agehusb <= oldlim) &
     (agewife >= younglim) &
     (agewife <= oldlim))
dcouples <- dcouples[selagesinterval, ]
pstarline()
cat("Dropped ", sum(!selagesinterval), " couples (",  100 * mean(!selagesinterval),
    "% of sample) with a partner younger than ",  younglim,
    " or older than ", oldlim, ".\n")
pstarline()

## select on ages at marriage
yearsurvey  <- dcouples$YearSurvey
yearmarr <- dcouples$YearMarried
agemarrhusb <- dcouples$AgeHusband - (yearsurvey - yearmarr)
agemarrwife <- dcouples$AgeWife - (yearsurvey - yearmarr)
selagesminmarr <-
  (agemarrhusb >= husbminmarrage &  agemarrwife >= wifeminmarrage)
dcouples <- dcouples[selagesminmarr, ]
pstarline()
cat("Dropped ", sum(!selagesminmarr),  " couples (",
    100 * mean(!selagesminmarr),  "% of sample) with a partner below minimum age.\n")
cat("   (minimum ages=", husbminmarrage, "for men and ",  wifeminmarrage, "for women.)\n")
pstarline()


if (reclassOld == 1) {
  yearsurvey  <- dcouples$YearSurvey
  yearmarr <- dcouples$YearMarried
  agemarrhusb <- dcouples$AgeHusband - (yearsurvey - yearmarr)
  agemarrwife <- dcouples$AgeWife - (yearsurvey - yearmarr)
  
  selagesmaxmarr <-
    ((agemarrhusb <= husbmaxmarrage) &
       (agemarrwife <= wifemaxmarrage))
  reclasscouples <- dcouples[(!selagesmaxmarr), ]
  nreclass <- NROW(reclasscouples)
  ## couples with one partner outside of the marriage age window:
  ##                   we reclassify both partners as single
  ## for couples: year, hhwt, husbrace, wiferace, I, J, age-man, age-woman,
  ##    year-marriage, marst
  singlemenreclass <-
    data.frame(
      reclasscouples$YearSurvey,
      reclasscouples$HHweight,
      reclasscouples$RaceHusband,
      numeric(nreclass),
      reclasscouples$EducHusband,
      numeric(nreclass),
      reclasscouples$AgeHusband,
      matrix(0, nreclass, 2),
      reclasscouples$MarriageStatus
    )
  singlewomenreclass <-
    data.frame(
      reclasscouples$YearSurvey,
      reclasscouples$HHweight,
      numeric(nreclass),
      reclasscouples$RaceWife,
      numeric(nreclass),
      reclasscouples$EducWife,
      numeric(nreclass),
      reclasscouples$AgeWife,
      numeric(nreclass),
      reclasscouples$MarriageStatus
    )
  dcouples <- dcouples[selagesmaxmarr, ]
  pstarline()
  cat( "Reclassified ",  nreclass,
       " couples (",  100 * mean(!selagesmaxmarr),
       "% of sample) with a partner above maximum age.\n" )
  cat("   (maximum ages=", husbmaxmarrage,  "for men and ",
      wifemaxmarrage, "for women.)\n")
  pstarline()
}


yearsurvey  <- dcouples$YearSurvey
yearmarr <- dcouples$YearMarried
agemarrhusb <- dcouples$AgeHusband - (yearsurvey - yearmarr)
agemarrwife <- dcouples$AgeWife - (yearsurvey - yearmarr)

## sampling weights for couples
fw <- dcouples$HHweight


# cat("\n\nAges at marriage:\n")
# fwplot <- fw / mean(fw)
# tages <- xtabs(fwplot ~ agemarrhusb + agemarrwife)
# pdf(paste0(resdir, "AgesAtMarriage", selectRace, younglim, ".pdf"))
# plot(tages,
#      main = paste0("Ages at Marriage for ", raceadjs),
#      xlab = "Husband",
#      ylab = "Wife",
#      col = "lightgreen")
# dev.off()

# 
# diffOfAges <- agemarrhusb - agemarrwife
# birthhusb <- yearsurvey - dcouples$AgeHusband
# 
# 
# cat("\n Quantiles of age at marriage of husband:\n")
# pQuantiles(agemarrhusb)
# cat(" Quantiles of age at marriage of wife:\n")
# pQuantiles(agemarrwife)
# cat(" Quantiles of age difference at marriage:\n")
# pQuantiles(diffOfAges)
# earlycoh <- (birthhusb %in% (1940:1942))
# cat("  for early cohorts:\n")
# pQuantiles(diffOfAges[earlycoh])
# cat("  for late cohorts:\n")
# if (selectRace != 3) {
#   latecoh <- (birthhusb %in% (1968:1970))
# } else {
#   latecoh <- (birthhusb %in% (1963:1965))
# }
# pQuantiles(diffOfAges[latecoh])
# cat("\nProportions in the age difference cells for early cohorts:\n")
# cat("  <= -1:", mean(diffOfAges[earlycoh]  <= -1), "\n")
# for (idiff in 0:4) {
#   cat("  ==", idiff,  mean(diffOfAges[earlycoh]  ==  idiff), "\n")
# }
# cat("  >= 5:", mean(diffOfAges[earlycoh]  >= 5), "\n")
# cat("\nProportions in the age difference cells for late cohorts:\n")
# cat("  <= -1:", mean(diffOfAges[latecoh]  <= -1), "\n")
# for (idiff in 0:4) {
#   cat("  ==", idiff,  mean(diffOfAges[latecoh]  ==  idiff), "\n")
# }
# cat("  >= 5:", mean(diffOfAges[latecoh]  >= 5), "\n")
# 
# 
# ## histogram of age difference
# pdf(file = paste0(resdir, "histAgeDiff", selectRace, younglim, ".pdf"))
# hist(
#   agemarrhusb - agemarrwife,
#   breaks = 100,
#   main = paste0("Age difference between Spouses for ", raceadj, "s"),
#   xlab = "Age difference",
#   col = 3,
#   xlim = c(-15, 20)
# )
# abline(v = 0, lty = 2)
# dev.off()


# 
# ## window for plots
# begplot <- 1940
# endplot <- 2010 - younglim
# cohplot <- seq(begplot, endplot)
# ncohplot <- endplot - begplot + 1
# cohinds <- cohplot - begplot + 1

# ## relative educations in couples
# educman <- dcouples$EducHusband
# educwoman <- dcouples$EducWife
# yearmarr <- dcouples$YearMarried
# yearsurvey <- dcouples$YearSurvey
# 
# manmoreeduc <- numeric(ncohplot)
# manlesseduc <- numeric(ncohplot)
# maneqeduc <- numeric(ncohplot)
# 
# for (iyear in 1:ncohplot) {
#   eman <- educman[birthhusb == (begplot + iyear - 1)]
#   ewoman <- educwoman[birthhusb == (begplot + iyear - 1)]
#   manmoreeduc[iyear] <- mean(eman > ewoman)
#   maneqeduc[iyear] <- mean(eman == ewoman)
#   manlesseduc[iyear] <- mean(eman < ewoman)
# }
# 
# 
# ## plot
# pdf(file = paste0(resdir, "ComparEducsCouple",
#                   selectRace,
#                   younglim,
#                   ".pdf"))
# plot(cohplot,
#      manmoreeduc,
#      type = "l",
#      xlab = "Year of birth of husband",
#      main = paste("Comparing educations within", raceadj, "couples"),
#      ylim = c(0, 0.7),
#      ylab = "Proportion")
# lines(cohplot, maneqeduc, col = 2)
# lines(cohplot, manlesseduc, col = 3)
# legend("top",
#        c( "Husband more educated",
#           "Same education",
#           "Husband less educated"
#        ),
#        col = seq(3),
#        lty = rep(1, 3)
# )
# dev.off()



## now work on combined data

constructData <- function(gender) {
  ## for singles: year, hhwt, race, educ, age, sex (male 1, female 2), marst
  selsingle <- ((dsingles$Gender == gender) &
                  (dsingles$Age <= oldlim) &
                  (dsingles$Age >= younglim))
  nsinglesel <- sum(selsingle)
  dsinglesel <- dsingles[selsingle, ]
  ## construct data for single  shaped like the data for couples
  if (gender == 1) {
    newdsingle <-
      data.frame(
        dsinglesel$YearSurvey,
        dsinglesel$HHweight,
        dsinglesel$Race,
        numeric(nsinglesel),
        dsinglesel$Education,
        numeric(nsinglesel),
        dsinglesel$Age,
        matrix(0, nsinglesel, 2),
        dsinglesel$MarriageStatus
      )
  }
  if (gender == 2) {
    newdsingle <-
      data.frame(
        dsinglesel$YearSurvey,
        dsinglesel$HHweight,
        numeric(nsinglesel),
        dsinglesel$Race,
        numeric(nsinglesel),
        dsinglesel$Education,
        numeric(nsinglesel),
        dsinglesel$Age,
        numeric(nsinglesel),
        dsinglesel$MarriageStatus
      )
  }
  names(newdsingle) <- varNames
  ## now combine datasets
  if (reclassOld == 0) {
    ## concatenate couples and single
    newdata <- rbind(dcouples, newdsingle)
  }
  if (reclassOld == 1 & gender == 1) {
    names(singlemenreclass) <- varNames
    ## concatenate couples and single men (including reclassified)
    newdata <- rbind(dcouples, newdsingle, singlemenreclass)
  }
  if (reclassOld == 1 & gender == 2) {
    names(singlewomenreclass) <- varNames
    ## concatenate couples and single women (including reclassified)
    newdata <- rbind(dcouples, newdsingle, singlewomenreclass)
  }
  
  ### we return
  newdata
}


mendata[[irace]] <- constructData(1)
womendata[[irace]] <- constructData(2)

}

finalsample_csw <- bind_rows(mendata[[1]], mendata[[2]], womendata[[1]] %>% filter(MarriageStatus == 6), womendata[[2]] %>% filter(MarriageStatus == 6))

write.csv(finalsample_csw, file = "output_data/finalsample_csw.csv", row.names = FALSE, quote = FALSE)
